# “Income inequality is unrelated to perceived inequality and support for redistribution” by Kris-Stella Trump, forthcoming in Social Science Quarterly

# Thank you to Riley Carney for her contributions to this code
# Written in R version 4.0
# Last update: 2020.06.30

# The code in this file prepares the publicly available ISSP data files for analysis

#### Set-up ####

library(Hmisc)
library(weights)
library(tidyverse)
library(foreign)

#####Start by loading the merged ISSP dataset 1987-2014

#Data citation:
#ISSP Research Group (2014): International Social Survey Programme: Social Inequality I-IV - ISSP 1987-1992-1999-2009. GESIS Data Archive, Cologne. ZA5890 Data file Version 1.0.0, doi:10.4232/1.11911

#Load main data
issp <- read.dta("Main data ZA5890_v1-0-0.dta", convert.factors = FALSE)
names(issp)
#To keep informative country-year names, read in country-year variable as factor
issp_factor  <- read.dta("Main data ZA5890_v1-0-0.dta", convert.factors = TRUE)
issp$V7 <- issp_factor$V7
#also use this for country variable
issp$country <- issp_factor$V5
#and make sure year is labelled clearly
issp$year <- issp$V4



##### Also load add-on dataset with income perceptions and preferences
#Data citation: 
#ISSP Research Group (2014): International Social Survey Programme: Social Inequality I-IV ADD ON - ISSP 1987-1992-1999-2009. GESIS Data Archive, Cologne. ZA5891 Data file Version 1.1.0, doi:10.4232/1.12041
addon <- read.dta("Add-on data ZA5891_v1-1-0.dta")
names(addon)

#### Income is and ought questions in addon file #### 

#V8 - Actually earn doctor
#V9 - Actually earn CEO
#V11 - Actually earn: Unskilled worker in a factory
#V12 - actually earn cabinet minister
#V23 Should earn doctor
#V24: Should earn: CEO
#V26 Should earn: unskilled
#V27 Should earn: cabinet minister
summary(addon$V8)
summary(addon$V9)
summary(addon$V11)
summary(addon$V12)
summary(addon$V23)
summary(addon$V24)
summary(addon$V26)
summary(addon$V27)

# Per codebook, ISSP codes don't knows and refusals at negative numbers, but here there are no <0 entries and a fair number of NA's. Conclude that NA's have been set as such already.

#Some country-years have particularities in how the answers have been coded that are not listed in the codebook
#(see https://zacat.gesis.org/webview/index.jsp?object=http://zacat.gesis.org/obj/fStudy/ZA5891 and navigate to the substantive variables)
#Pull up a summary of the country-years in this dataset
summary(addon$V7)

#Italy 1987 - 0 means NA
summary(addon[addon$V7=="IT_1987","V8"])
summary(addon[addon$V7=="IT_1987","V9"])
summary(addon[addon$V7=="IT_1987","V11"])
summary(addon[addon$V7=="IT_1987","V12"])
summary(addon[addon$V7=="IT_1987","V23"])
summary(addon[addon$V7=="IT_1987","V24"])
summary(addon[addon$V7=="IT_1987","V26"])
summary(addon[addon$V7=="IT_1987","V27"])
#It appears these questions were not asked, and this is already reflected in the data which is set to NA

#Australia 1987 - coded to bottom rather than midpoints of available bins
table(addon[addon$V7=="AU_1987","V8"])
table(addon[addon$V7=="AU_1987","V9"])
table(addon[addon$V7=="AU_1987","V11"])
table(addon[addon$V7=="AU_1987","V12"])
table(addon[addon$V7=="AU_1987","V23"])
table(addon[addon$V7=="AU_1987","V24"])
table(addon[addon$V7=="AU_1987","V26"])
table(addon[addon$V7=="AU_1987","V27"])

#Not all expected bins appear, but that is probably because no respondents picked the "missing" bins. 
#The problem with changing this (bottom rather than midpoint coding) up is that people are more likely to give round numbers; the existing coding is therefore likely to be closer to the true value than the midpoint of the bin
#Choose to leave this as-is

#In 1987, Australia, Germany (DE-W_1987), US, and Poland top-coded the answers
addon$topcoded <- addon$V7==c("AU_1987") | addon$V7==c("DE-W_1987") | addon$V7==c("PL_1987") | addon$V7==c("US_1987")

#US 1987 has some mixed coding; 999992, 999993, 999994 = more than now paid etc (see codebook). 999996 = more than $1m
table(addon[addon$V7=="US_1987", "V23"])
table(addon[addon$V7=="US_1987", "V24"])
table(addon[addon$V7=="US_1987", "V26"])
table(addon[addon$V7=="US_1987", "V27"])

#these sums only used for the us:
sum(addon$V23==999992, na.rm=T)
sum(addon$V23==999993, na.rm=T)
sum(addon$V23==999994, na.rm=T)

sum(addon$V24==999992, na.rm=T)
sum(addon$V24==999993, na.rm=T)
sum(addon$V24==999994, na.rm=T)

sum(addon$V26==999992, na.rm=T)
sum(addon$V26==999993, na.rm=T)
sum(addon$V26==999994, na.rm=T)

sum(addon$V27==999992, na.rm=T)
sum(addon$V27==999993, na.rm=T)
sum(addon$V27==999994, na.rm=T)

#set these to missing.

addon$V23 <-  na_if(addon$V23, 999992)
addon$V23 <-  na_if(addon$V23, 999993)
addon$V23 <-  na_if(addon$V23, 999994)

addon$V24 <-  na_if(addon$V24, 999992)
addon$V24 <-  na_if(addon$V24, 999993)
addon$V24 <-  na_if(addon$V24, 999994)

addon$V26 <-  na_if(addon$V26, 999992)
addon$V26 <-  na_if(addon$V26, 999993)
addon$V26 <-  na_if(addon$V26, 999994)

addon$V27 <-  na_if(addon$V27, 999992)
addon$V27 <-  na_if(addon$V27, 999993)
addon$V27 <-  na_if(addon$V27, 999994)


#Some countries asked for incomes before taxes; others for income after. Add indicator for after taxes (ref category includes before taxes and unspecified)
addon$aftertaxes <- ifelse(addon$V7=="CH_1987" | addon$V7=="CH_2009" | addon$V7=="ES_2009" | addon$V7=="IL_1999" | addon$V7=="IL_2009" | addon$V7=="IT_2009" | addon$V7=="JP_2009" | addon$V7=="LV_1999" | addon$V7=="LV_2009" | addon$V7=="PL_1987" | addon$V7=="PL_1999" | addon$V7=="PL_2009" | addon$V7=="PT_1999" | addon$V7=="PT_2009" | addon$V7=="SI_1992" | addon$V7=="SI_1999" | addon$V7=="SI_2009" | addon$V7=="SK_1999" | addon$V7=="SK_2009", 1, 0)
table(addon$V7, addon$aftertaxes)
#Note also that the following countries have specified pre and post in different years:
#Italy (change includes switch to Euro), Poland (outlier is 1992), Slovakia (outlier is 92).
#Given other changes in these economies in these years, can't use this reliably to estimate impact of wording.

### Create hh income quintile estimates ####

#Create household income variable that, as closely as possible, mimics quintiles on a within-country-year basis. Will use add-on dataset.
#There is huge variation in the answer options between countries and between years, so assigning any kind
#of cut-point is going to be largely artificial. Will use the .bincode function for this.

# I reassign the values of each factor level because they typically start at a higher number than 1. This
## isn't necessary because the categories "NA" and "country specific variable" don't contain any observations and don't
## interfere in how the quintiles are counted for each country. I reassign the values here in order to more easily assess how each variable
## is coerced into quintiles, but the results are equivalent without it.
summary(addon$AT_INC87)
levels(addon$AT_INC87) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12", "13", "14", "15", "16", "17", "18", "19", "20", "21")
addon$AT_INC87 <- as.numeric(as.character(addon$AT_INC87))

summary(addon$AT_INC92)
levels(addon$AT_INC92) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12", "13", "14", "15", "16", "17", "18", "19", "20")
addon$AT_INC92 <- as.numeric(as.character(addon$AT_INC92))

summary(addon$AT_INC99)
levels(addon$AT_INC99) <- c(NA, NA, "1","2","3","4","5","6","7")
addon$AT_INC99 <- as.numeric(as.character(addon$AT_INC99))

summary(addon$AT_INC09)
levels(addon$AT_INC09) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12")
addon$AT_INC09 <- as.numeric(as.character(addon$AT_INC09))

summary(addon$AU_INC87) #actual value

summary(addon$AU_INC92) #actual value

summary(addon$AU_INC99)
levels(addon$AU_INC99) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14", "15", "16", "17", "18")
addon$AU_INC99 <- as.numeric(as.character(addon$AU_INC99))

summary(addon$AU_INC09)
levels(addon$AU_INC09) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14")
addon$AU_INC09 <- as.numeric(as.character(addon$AU_INC09))

summary(addon$BG_INC99) # actual value
summary(addon$BG_INC09) # actual value

summary(addon$CA_INC92)
levels(addon$CA_INC92) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8")
addon$CA_INC92 <- as.numeric(as.character(addon$CA_INC92))

summary(addon$CA_INC99)
levels(addon$CA_INC99) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8")
addon$CA_INC99 <- as.numeric(as.character(addon$CA_INC99))

summary(addon$CH_INC87)
levels(addon$CH_INC87) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12")
addon$CH_INC87 <- as.numeric(as.character(addon$CH_INC87))

summary(addon$CH_INC09)
levels(addon$CH_INC09) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11")
addon$CH_INC09 <- as.numeric(as.character(addon$CH_INC09))

summary(addon$CL_INC99)
levels(addon$CL_INC99) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14")
addon$CL_INC99 <- as.numeric(as.character(addon$CL_INC99))

summary(addon$CL_INC09)
levels(addon$CL_INC09) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14","15")
addon$CL_INC09 <- as.numeric(as.character(addon$CL_INC09))

summary(addon$CY_INC99)
levels(addon$CY_INC99) <- c(NA, NA, "1","2","3","4","5","6","7","8")
addon$CY_INC99 <- as.numeric(as.character(addon$CY_INC99))

summary(addon$CY_INC09)
levels(addon$CY_INC09) <- c(NA, NA, "1","2","3","4","5","6","7","8","9")
addon$CY_INC09 <- as.numeric(as.character(addon$CY_INC09))

summary(addon$CZ_INC92) #actual value
summary(addon$CZ_INC99) #actual value

summary(addon$CZ_INC09)
levels(addon$CZ_INC09) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18")
addon$CZ_INC09 <- as.numeric(as.character(addon$CZ_INC09))

summary(addon$DE_IN87b)
levels(addon$DE_IN87b) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20", "21", "22", "23")
addon$DE_IN87b <- as.numeric(as.character(addon$DE_IN87b))

summary(addon$DE_IN92b)
levels(addon$DE_IN92b) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18", "19", "20", "21", "22")
addon$DE_IN92b <- as.numeric(as.character(addon$DE_IN92b))

#two alternative income measures for Germany are available. They are continuous but have lower response rates. Drop those in anticipation of grep use later in code.
addon$DE_IN87a <- NULL
addon$DE_IN92a <- NULL 

summary(addon$ES_INC99)
levels(addon$ES_INC99) <- c(NA, NA, "1","2","3","4","5","6","7","8","9")
addon$ES_INC99 <- as.numeric(as.character(addon$ES_INC99))

summary(addon$ES_INC09)
levels(addon$ES_INC09) <- c(NA, NA, "1","2","3","4","5","6","7","8","9", "10")
addon$ES_INC09 <- as.numeric(as.character(addon$ES_INC09))

summary(addon$FR_INC99)
levels(addon$FR_INC99) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11")
addon$FR_INC99 <- as.numeric(as.character(addon$FR_INC99))

summary(addon$FR_INC09)
levels(addon$FR_INC09) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12")
addon$FR_INC09 <- as.numeric(as.character(addon$FR_INC09))

summary(addon$GB_INC87)
levels(addon$GB_INC87) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12", "13")
addon$GB_INC87 <- as.numeric(as.character(addon$GB_INC87))

summary(addon$GB_INC92)
levels(addon$GB_INC92) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12", "13", "14")
addon$GB_INC92 <- as.numeric(as.character(addon$GB_INC92))
table(addon$GB_INC92)

summary(addon$GB_INC99)
levels(addon$GB_INC99) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12", "13", "14", "15", "16", "17", "18")
addon$GB_INC99 <- as.numeric(as.character(addon$GB_INC99))
table(addon$GB_INC99)

summary(addon$GB_INC09)
levels(addon$GB_INC09) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12", "13", "14", "15", "16", "17")
addon$GB_INC09 <- as.numeric(as.character(addon$GB_INC09))
table(addon$GB_INC09)

summary(addon$HU_INC87) # actual value
summary(addon$HU_INC92) # actual value
summary(addon$HU_INC99) # actual value
summary(addon$HU_INC09) # actual value

summary(addon$IL_INC09) 
levels(addon$IL_INC09) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10", "11")
addon$IL_INC09 <- as.numeric(as.character(addon$IL_INC09))

summary(addon$IT_INC87) 
levels(addon$IT_INC87) <- c(NA,"1","2","3","4","5","6","7","8","9","10","11","12", "13", "14", "15", "16", "17", "18")
addon$IT_INC87 <- as.numeric(as.character(addon$IT_INC87))

summary(addon$IT_INC92) 
levels(addon$IT_INC92) <- c(NA, NA,"1","2","3","4","5","6","7","8","9","10","11","12", "13", "14", "15", "16", "17", "18")
addon$IT_INC92 <- as.numeric(as.character(addon$IT_INC92))

summary(addon$IT_INC09) 
levels(addon$IT_INC09) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10")
addon$IT_INC09 <- as.numeric(as.character(addon$IT_INC09))

summary(addon$JP_INC99)
levels(addon$JP_INC99) <- c(NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12")
addon$JP_INC99 <- as.numeric(as.character(addon$JP_INC99))

summary(addon$JP_INC09)
levels(addon$JP_INC09) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12")
addon$JP_INC09 <- as.numeric(as.character(addon$JP_INC09))

summary(addon$LV_INC99) # actual value
summary(addon$LV_INC09) # actual value

summary(addon$NO_INC92)
levels(addon$NO_INC92) <- c(NA, NA, "1","2","3","4","5","6","7","8","9")
addon$NO_INC92 <- as.numeric(as.character(addon$NO_INC92))

summary(addon$NO_INC99) # actual value
summary(addon$NO_INC09) # actual value

summary(addon$NZ_INC92)
levels(addon$NZ_INC92) <- c(NA, NA, "1","2","3","4","5","6","7","8","9")
addon$NZ_INC92 <- as.numeric(as.character(addon$NZ_INC92))

summary(addon$NZ_INC99)
levels(addon$NZ_INC99) <- c(NA, NA, "1","2","3","4","5","6","7","8","9")
addon$NZ_INC99 <- as.numeric(as.character(addon$NZ_INC99))

summary(addon$NZ_INC09)
levels(addon$NZ_INC09) <- c(NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11", "12", "13", "14")
addon$NZ_INC09 <- as.numeric(as.character(addon$NZ_INC09))

summary(addon$PH_INC99) # actual value
summary(addon$PH_INC09) # actual value

summary(addon$PL_INC92) # actual value
summary(addon$PL_INC99) # actual value
summary(addon$PL_INC09) # actual value

summary(addon$PT_INC99)
levels(addon$PT_INC99) <- c(NA, NA, NA, "1","2","3","4","5","6")
addon$PT_INC99 <- as.numeric(as.character(addon$PT_INC99))

summary(addon$PT_INC09)
levels(addon$PT_INC09) <- c(NA, NA, NA, NA, "1","2","3","4","5","6")
addon$PT_INC09 <- as.numeric(as.character(addon$PT_INC09))

summary(addon$RU_INC92) # actual value
summary(addon$RU_INC99) # actual value
summary(addon$RU_INC09) # actual value

summary(addon$SE_INC99) #actual value
summary(addon$SE_INC09) #actual value

summary(addon$SI_INC92) # actual value
summary(addon$SI_INC99) # actual value
summary(addon$SI_INC09) # actual value

summary(addon$SK_INC92) # actual value
summary(addon$SK_INC99) # actual value

summary(addon$SK_INC09)
levels(addon$SK_INC09) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11")
addon$SK_INC09 <- as.numeric(as.character(addon$SK_INC09))

summary(addon$US_INC87)
levels(addon$US_INC87) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14", "15", "16","17","18","19","20")
addon$US_INC87 <- as.numeric(as.character(addon$US_INC87))

summary(addon$US_INC92)
levels(addon$US_INC92) <- c(NA, NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14", "15", "16","17","18","19","20", "21")
addon$US_INC92 <- as.numeric(as.character(addon$US_INC92))

summary(addon$US_INC99)
levels(addon$US_INC99) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14", "15", "16","17","18","19","20", "21", "22", "23")
addon$US_INC99 <- as.numeric(as.character(addon$US_INC99))

summary(addon$US_INC09)
levels(addon$US_INC09) <- c(NA, NA, NA, NA, "1","2","3","4","5","6","7","8","9","10","11","12","13","14", "15", "16","17","18","19","20","21","22","23","24","25","26")
addon$US_INC09 <- as.numeric(as.character(addon$US_INC09))


# generate a matrix for each year, start with 2009
income09.mat <- addon[,grep("_INC09", names(addon), value=TRUE)]

#take weights from main dataset
weights <- issp$WEIGHT

# Tell R where to place categories using the ".bincode" function, allowing for non-unique breaks
quintile.mat09 <- matrix(NA,nrow=nrow(income09.mat),ncol=ncol(income09.mat)) 
for(i in 1:ncol(income09.mat)){
  quintile.mat09[,i] <- as.integer(.bincode(income09.mat[,i], wtd.quantile(income09.mat[,i], weights=weights, 
                                                                             probs=0:5/5, na.rm=TRUE), include.lowest=TRUE))
}
# Check it:
summary(quintile.mat09[,1]) 
summary(addon$AT_INC09) # total number of NAs:  102802 -- checks out.

#Look at the bins
summary(quintile.mat09)

#collapse into one vector
income09 <- rowSums(quintile.mat09, na.rm=T) #sum across the columns to generate a single income value (NAs will drop)
income09[income09==0] <- NA # change zeros back to NAs
hist(income09)

#Same process for 1999
# create matrix
income99.mat <- addon[,grep("_INC99", names(addon), value=TRUE)]

quintile.mat99 <- matrix(NA,nrow=nrow(income99.mat),ncol=ncol(income99.mat)) # create empty matrix to put income data
dim(quintile.mat99)
for(i in 1:ncol(income99.mat)){
  quintile.mat99[,i] <- as.integer(.bincode(income99.mat[,i], wtd.quantile(income99.mat[,i], weights=weights, 
                                                                           probs=0:5/5, na.rm=TRUE), include.lowest=TRUE))
}
#Look at quintiles
summary(quintile.mat99)

income99 <- rowSums(quintile.mat99, na.rm=T)
income99[income99==0] <- NA 
hist(income99)

## 1992

income92.mat <- addon[,grep("_INC92", names(addon), value=TRUE)]

quintile.mat92 <- matrix(NA,nrow=nrow(income92.mat),ncol=ncol(income92.mat)) # create empty matrix to put income data
dim(quintile.mat92)
income92.mat <- sapply(income92.mat, as.numeric )
for(i in 1:ncol(income92.mat)){
  quintile.mat92[,i] <- as.integer(.bincode(income92.mat[,i], wtd.quantile(income92.mat[,i], weights=weights, 
                                                                           probs=0:5/5, na.rm=TRUE), include.lowest=TRUE))
}

summary(quintile.mat92)
income92 <- rowSums(quintile.mat92, na.rm=T)
income92[income92==0] <- NA 
hist(income92)

## 1987

income87.mat <- addon[,grep("_INC87", names(addon), value=TRUE)]

quintile.mat87 <- matrix(NA,nrow=nrow(income87.mat),ncol=ncol(income87.mat)) # create empty matrix to put income data
dim(quintile.mat87)
income87.mat <- sapply(income87.mat, as.numeric )
for(i in 1:ncol(income87.mat)){
  quintile.mat87[,i] <- as.integer(.bincode(income87.mat[,i], wtd.quantile(income87.mat[,i], weights=weights, 
                                                                           probs=0:5/5, na.rm=TRUE), include.lowest=TRUE))
}

summary(quintile.mat87)
income87 <- rowSums(quintile.mat87, na.rm=T)
income87[income87==0] <- NA 
hist(income87)


## add back in identifiers
identifiers <- issp[,c("V3", "V7")]
#check to make sure the order is the same
head(addon$V3)
head(issp$V3) # looks good

income.full <- cbind(income09,income99,income92,income87)
income.quintile <- rowSums(income.full, na.rm=T)
income.quintile[income.quintile==0] <- NA

income.full <- cbind(identifiers, income.quintile)
#double check
summary(income.full)
dim(income.full)
head(income.full)

# merge in with issp cumulative dataset
issp <- merge(issp, income.full, by=c("V7", "V3"))

#### Pay ratio measures and income variables ####

#Calculate CEO-factory worker ratio perception
addon$percratio <- addon$V9/addon$V11
summary(addon$percratio)

#Calculate CEO-factory worker ratio preference
addon$prefratio <- addon$V24/addon$V26
summary(addon$prefratio)

#Expect salaries as well as ratios to be approximated by log-normal distributions - do a couple of spot checks

#Percratio
addon %>% 
  filter(V7=="AU_2009") %>%
  ggplot(mapping = aes(percratio)) +
  geom_density()

addon %>% 
  filter(V7=="AU_2009") %>%
  ggplot(mapping = aes(log(percratio))) +
  geom_density()

#Salary perception - unskilled
addon %>% 
  filter(V7=="FR_2009") %>%
  ggplot(mapping = aes(V11)) +
  geom_density()

addon %>% 
  filter(V7=="FR_2009") %>%
  ggplot(mapping = aes(log(V11))) +
  geom_density()

#Salary prescription - CEO
addon %>% 
  filter(V7=="JP_1999") %>%
  ggplot(mapping = aes(V24)) +
  geom_density()

addon %>% 
  filter(V7=="JP_1999") %>%
  ggplot(mapping = aes(log(V24))) +
  geom_density()

#These are all ~log-normal. 

#Rename the salary variables to be more informative and allow merge with issp dataset without overwriting identically named variables there
addon <- addon %>% 
  rename(dr_is = V8, 
         ceo_is = V9,
         unsk_is = V11,
         minister_is = V12,
         dr_ought = V23,
         ceo_ought = V24,
         unsk_ought = V26,
         minister_ought = V27)

#pull out variables we want to merge into main ISSP dataset from the addon dataset
addonvars <- addon[,c("V7", "V3", "dr_is", "ceo_is", "unsk_is", "minister_is", "dr_ought", "ceo_ought", "unsk_ought", "minister_ought", "topcoded", "aftertaxes", "percratio", "prefratio")]

#merge these in with issp cumulative dataset 
issp <- merge(issp, addonvars, by=c("V7", "V3"))

#Some prior literature estimates GINIs and respondent positions in society using weighted ISCO measures. For example Kuhn 2019 uses an indicator for whether a respondent is in one of the top two ISCO major groupings. Will follow his approach and try to create an indicator for this.

#Take only indicator of whether the individual is in one of the top two ISCO major groupings
issp <- issp %>%
  mutate(ISCO_top = ifelse(ISCO88>999 & ISCO88<3000,1,0))

iscoshares <- issp %>% 
  group_by(V7) %>% 
  summarise(obs = n(), ISCOnas = sum(is.na(ISCO_top)), ISCOnashare = sum(is.na(ISCO_top))/n(), weightsum = sum(WEIGHT), ISCOweightsum = sum(ISCO_top*WEIGHT, na.rm=T),  ISCOtopshare = sum(ISCO_top*WEIGHT, na.rm=T)/sum(WEIGHT))
#there is a lot of missingngess, and a lot of variation in missingness here. A few country-years did not ask the question. 
#Several country-years have missingness near or over 50%
plot(iscoshares$ISCOnashare, iscoshares$ISCOtopshare)
#This shows a negative relationship between rate of missingness and share of population estimated to be in top 2 categories 
#conclude this would be better estimated after multiple imputation which includes income quintile and topbot as further info on this
#also violates missing at random assumptions - use either before or after imputation should be viewed quite carefully
#Conclude this is difficult to carry out reliably, but in case I want to do something with this later, carry over some minimal info: average of top three, unskilled, and ISCOtop.

issp <- issp %>% 
  mutate(top_income_is = (ceo_is + minister_is + dr_is)/3,
         top_income_ought = (ceo_ought + minister_ought + dr_ought)/3)

#quick checks
summary(issp$top_income_is)
summary(issp$top_income_ought)

issp %>% 
  ggplot(mapping = aes(log(top_income_is))) +
  geom_density()

issp %>% 
  filter(V7=="AU_2009") %>%
  ggplot(mapping = aes(log(top_income_is))) +
  geom_density()

#the income estimate variables are log-normal: for imputation, log-transform them
#first, replace a 0 value with 1 to avoid infinite log values
issp <- issp %>% 
  mutate(unsk_ought = ifelse(unsk_ought==0, 1, unsk_ought))
#mutate to log
issp <- issp %>% 
  mutate(
    top_income_is_log = log(top_income_is),
    top_income_ought_log = log(top_income_ought),
    unsk_is_log = log(unsk_is),
    unsk_ought_log = log(unsk_ought),
    ceo_is_log = log(ceo_is),
    ceo_ought_log = log(ceo_ought)
  )


#### Variable transformations ####

#for multiple imputation - note variables that will need to be marked as ordinal
#to follow the reasoning, use this together with zacat codebook online

#Type of society
table(issp$V60) #-> nominal
table(issp$V61) #-> nominal

#Class membership
table(issp$V66, useNA="always") #use as continuous

#intergenerational status mobility - using this as ordinal requires dropping two categories (in which respondent does not know for various reasons)
table(issp$V67, useNA="always")
issp$V67[issp$V67>5] <- NA
table(issp$V67, useNA="always")

#other demographics
issp$female <- issp$SEX==2
table(issp$SEX, issp$female, useNA="always")

summary(issp$AGE) #no need to adjust

#create a dummy for married / living as married
issp$married <- issp$MARITAL==1
table(issp$MARITAL, issp$married, useNA="always")

table(issp$DEGREE) #Use as continuous (highest category can later be used to create dummy of h.e.)

#create a dummy for self-employed and union members
issp$selfemp <- issp$SELFEMP==1
issp$union <- issp$UNION==1

#Vote choice - note these all have a lot of missingness
table(issp$PRTY_LR1, useNA="always") #nominal (from vote intention)
table(issp$PRTY_LR2, useNA="always") #nominal (from lr placement)

#voted last election - make a dummy
issp$vote_le <- issp$VOTE_LE==1

#vote choice
table(issp$VOTE_LR) #nominal

#religion - use as continuous 
table(issp$ATTEND)

#self-placement - use as continuous
table(issp$TOPBOT)



#### Multiple imputation #### 


##For imputation, retain relevant questions that have been asked at least three times
keeps <- c("V3","V7","V8","V10","V13","V14", "V26", "V27", "V28", "V32","V33",
           "V36","V37","V39","V40","V45","V46","V48","V52","V53","V55","V56","V57","V58", "V60", "V61", "V66","V67",
           "female", "AGE","married","DEGREE","selfemp","union","PRTY_LR1","PRTY_LR2","vote_le",
           "VOTE_LR","ATTEND","TOPBOT","country", "year", "ISCO_top", "top_income_is_log", "ceo_is_log", "unsk_is_log", 
           "top_income_ought_log", "ceo_ought_log", "unsk_ought_log", "income.quintile", "topcoded", "aftertaxes", "WEIGHT")
issp.small <- issp[,names(issp) %in% keeps]
rm(issp_factor, addon, issp, quintile.mat09, quintile.mat87, quintile.mat92, quintile.mat99, addonvars, identifiers, income.full, income09.mat, income87.mat, income92.mat, income99.mat)

#Impute
library(mi)
issp_missing <- missing_data.frame(issp.small)
show(issp_missing)
issp_missing <- change(issp_missing, y=c("V7", "V3", "V8", "V10", "V13", "V14", "V26", "V28", "V32", "V33", "V36", 
                                         "V37", "V40", "V45", "V46", "V48", "V52", "V53", "V55", "V56", "V57", 
                                         "V58", "V67", "PRTY_LR1", "PRTY_LR2", "VOTE_LR", "ATTEND", "WEIGHT",
                                         "country", "year", "income.quintile"), 
                                          what="type", 
                                          to=c("irrelevant", "irrelevant", "continuous", "continuous", "continuous", "continuous", 
                                               "continuous", "continuous", "continuous", "continuous", "continuous", 
                                          "continuous", "continuous", "continuous", "continuous", "continuous", "continuous", 
                                          "continuous", "continuous", "continuous", "continuous", 
                                          "continuous", "continuous", "unordered-categorical", "unordered-categorical",
                                          "unordered-categorical", "continuous", "irrelevant", 
                                          "group", "group", "continuous"))
show(issp_missing)
issp_imputed <- mi(issp_missing, n.iter = 100, n.chains = 5, max.minutes=Inf, seed=06302020, parallel=FALSE)

#check imputations
show(issp_imputed)
round(mipply(issp_imputed, mean, to.matrix = TRUE), 3)

#save this 
save(issp_imputed, file="issp_imputed.RData")

#Next step is to merge the imputed ISSP data with SWIID data (which comes already imputed at the country level) - see next R script.